Revert "Replace the tokenizer with a flex-based scanner (#3846)"

This reverts commit 99eaf5faf9.
This commit is contained in:
Ashe Connor
2017-11-10 10:22:47 +11:00
committed by Ashe Connor
parent 0f4955e5d5
commit 0698b0f36e
15 changed files with 202 additions and 8914 deletions

diff --git a/lib/linguist/blob_helper.rb b/lib/linguist/blob_helper.rb

@@ -275,8 +275,10 @@ module Linguist
           # also--importantly--without having to duplicate many (potentially
           # large) strings.
           begin
-            data.split(encoded_newlines_re, -1)
+            encoded_newlines = ["\r\n", "\r", "\n"].
+              map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) }
+            data.split(Regexp.union(encoded_newlines), -1)
           rescue Encoding::ConverterNotFoundError
             # The data is not splittable in the detected encoding.  Assume it's
             # one big line.
@@ -287,51 +289,6 @@ module Linguist
       end
     end

-    def encoded_newlines_re
-      @encoded_newlines_re ||= Regexp.union(["\r\n", "\r", "\n"].
-        map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) })
-    end
-
-    def first_lines(n)
-      return lines[0...n] if defined? @lines
-      return [] unless viewable? && data
-
-      i, c = 0, 0
-      while c < n && j = data.index(encoded_newlines_re, i)
-        i = j + $&.length
-        c += 1
-      end
-      data[0...i].split(encoded_newlines_re, -1)
-    end
-
-    def last_lines(n)
-      if defined? @lines
-        if n >= @lines.length
-          @lines
-        else
-          lines[-n..-1]
-        end
-      end
-      return [] unless viewable? && data
-
-      no_eol = true
-      i, c = data.length, 0
-      k = i
-      while c < n && j = data.rindex(encoded_newlines_re, i - 1)
-        if c == 0 && j + $&.length == i
-          no_eol = false
-          n += 1
-        end
-        i = j
-        k = j + $&.length
-        c += 1
-      end
-      r = data[k..-1].split(encoded_newlines_re, -1)
-      r.pop if !no_eol
-      r
-    end
-
     # Public: Get number of lines of code
     #
     # Requires Blob#data
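
For context on the splitting logic both sides of this hunk rely on: blob data is usually carried as raw bytes (ASCII-8BIT) even when the content is detected as, say, UTF-16LE, so a regexp built from a plain UTF-8 "\r\n" would never match the two-byte newlines in the data. A minimal sketch, not part of the commit, of the transcode-then-force_encoding trick:

    # Sketch only: UTF-16LE content carried as raw bytes, as Linguist does.
    detected = Encoding::UTF_16LE
    data = "a\nb\r\nc".encode(detected).force_encoding("ASCII-8BIT")

    # Encode each newline into the detected encoding ("\n" becomes the two
    # bytes "\n\x00"), then force it back to the data's byte-level encoding
    # so the regexp matches bytewise. "\r\n" is listed first so the
    # alternation prefers it over a bare "\r".
    newlines = ["\r\n", "\r", "\n"].map do |nl|
      nl.encode(detected).force_encoding(data.encoding)
    end

    data.split(Regexp.union(newlines), -1).length  # => 3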

diff --git a/lib/linguist/classifier.rb b/lib/linguist/classifier.rb

@@ -3,8 +3,6 @@ require 'linguist/tokenizer'
 module Linguist
   # Language bayesian classifier.
   class Classifier
-    CLASSIFIER_CONSIDER_BYTES = 50 * 1024
-
     # Public: Use the classifier to detect language of the blob.
     #
     # blob - An object that quacks like a blob.
@@ -19,7 +17,7 @@ module Linguist
     # Returns an Array of Language objects, most probable first.
     def self.call(blob, possible_languages)
       language_names = possible_languages.map(&:name)
-      classify(Samples.cache, blob.data[0...CLASSIFIER_CONSIDER_BYTES], language_names).map do |name, _|
+      classify(Samples.cache, blob.data, language_names).map do |name, _|
         Language[name] # Return the actual Language objects
       end
     end
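
The lines removed here had capped classifier input at 50 KB; after the revert the whole blob is passed through, and the tokenizer's own 100 KB BYTE_LIMIT (restored below) becomes the effective cap. A trivial sketch of the dropped truncation, reusing the removed constant's name for illustration:

    CLASSIFIER_CONSIDER_BYTES = 50 * 1024

    data = "token " * 20_000                    # ~120 KB of ASCII input
    data[0...CLASSIFIER_CONSIDER_BYTES].bytesize  # => 51200 (flex-era cap)
    data.bytesize                                 # => 120000 (post-revert: full blob)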

diff --git a/lib/linguist/file_blob.rb b/lib/linguist/file_blob.rb

@@ -23,21 +23,21 @@ module Linguist
     #
     # Returns a String like '100644'
     def mode
-      @mode ||= File.stat(@fullpath).mode.to_s(8)
+      File.stat(@fullpath).mode.to_s(8)
     end

     # Public: Read file contents.
     #
     # Returns a String.
     def data
-      @data ||= File.read(@fullpath)
+      File.read(@fullpath)
     end

     # Public: Get byte size
     #
     # Returns an Integer.
     def size
-      @size ||= File.size(@fullpath)
+      File.size(@fullpath)
     end
   end
 end
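
These hunks drop the `||=` memoization added alongside the flex scanner, returning to a stat or read on every call. A minimal illustration, with a hypothetical class name, of what the removed caching did:

    class CachedBlob
      def initialize(path)
        @path = path
      end

      # `||=` evaluates File.read only on the first call and caches the
      # result; later calls return the cached string even if the file has
      # changed on disk. Without it, every call re-reads the file.
      def data
        @data ||= File.read(@path)
      end
    end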

diff --git a/lib/linguist/heuristics.rb b/lib/linguist/heuristics.rb

@@ -1,8 +1,6 @@
 module Linguist
   # A collection of simple heuristics that can be used to better analyze languages.
   class Heuristics
-    HEURISTICS_CONSIDER_BYTES = 50 * 1024
-
     # Public: Use heuristics to detect language of the blob.
     #
     # blob - An object that quacks like a blob.
@@ -16,7 +14,7 @@ module Linguist
     #
     # Returns an Array of languages, or empty if none matched or were inconclusive.
     def self.call(blob, candidates)
-      data = blob.data[0...HEURISTICS_CONSIDER_BYTES]
+      data = blob.data

       @heuristics.each do |heuristic|
         if heuristic.matches?(blob.name, candidates)
@@ -74,14 +72,6 @@ module Linguist
     # Common heuristics
     ObjectiveCRegex = /^\s*(@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">])/
-    CPlusPlusRegex = Regexp.union(
-      /^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/,
-      /^\s*template\s*</,
-      /^[ \t]*try/,
-      /^[ \t]*catch\s*\(/,
-      /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/,
-      /^[ \t]*(private|public|protected):$/,
-      /std::\w+/)

     disambiguate ".as" do |data|
       if /^\s*(package\s+[a-z0-9_\.]+|import\s+[a-zA-Z0-9_\.]+;|class\s+[A-Za-z0-9_]+\s+extends\s+[A-Za-z0-9_]+)/.match(data)
@@ -229,7 +219,8 @@ module Linguist
     disambiguate ".h" do |data|
       if ObjectiveCRegex.match(data)
         Language["Objective-C"]
-      elsif CPlusPlusRegex.match(data)
+      elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
+             /^\s*template\s*</.match(data) || /^[ \t]*try/.match(data) || /^[ \t]*catch\s*\(/.match(data) || /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/.match(data) || /^[ \t]*(private|public|protected):$/.match(data) || /std::\w+/.match(data))
         Language["C++"]
       end
     end
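
The two sides of that last hunk are functionally equivalent: Regexp.union folds several patterns into one alternation, so precompiling CPlusPlusRegex only changed where the work happened, not what matched. A reduced, hypothetical version of the C++ check:

    CPP_HINTS = Regexp.union(/^\s*template\s*</, /std::\w+/)

    CPP_HINTS =~ "std::string name;"                  # => 0 (match at index 0)
    # ...same verdict as the chained form restored by this revert:
    !!(/^\s*template\s*</ =~ "std::string name;" ||
       /std::\w+/ =~ "std::string name;")             # => true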

diff --git a/lib/linguist/strategy/modeline.rb b/lib/linguist/strategy/modeline.rb

@@ -109,8 +109,8 @@ module Linguist
       # Returns an Array with one Language if the blob has a Vim or Emacs modeline
       # that matches a Language name or alias. Returns an empty array if no match.
       def self.call(blob, _ = nil)
-        header = blob.first_lines(SEARCH_SCOPE).join("\n")
-        footer = blob.last_lines(SEARCH_SCOPE).join("\n")
+        header = blob.lines.first(SEARCH_SCOPE).join("\n")
+        footer = blob.lines.last(SEARCH_SCOPE).join("\n")
         Array(Language.find_by_alias(modeline(header + footer)))
       end
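
What this hunk trades away: blob.first_lines(n), removed from blob_helper above, stopped scanning at the n-th newline, while the restored blob.lines.first(n) splits the entire blob before keeping n lines. A rough sketch of the difference, assuming plain "\n" newlines:

    data = "line\n" * 1_000_000   # ~5 MB blob

    data.split("\n").first(2)     # splits a million lines, keeps two

    # first_lines-style: stop scanning after the second newline
    idx = 0
    2.times { idx = data.index("\n", idx) + 1 }
    data[0...idx].split("\n")     # => ["line", "line"]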

diff --git a/lib/linguist/tokenizer.rb b/lib/linguist/tokenizer.rb

@@ -1,5 +1,4 @@
 require 'strscan'
-require 'linguist/linguist'

 module Linguist
   # Generic programming language tokenizer.
@@ -16,5 +15,191 @@ module Linguist
     def self.tokenize(data)
       new.extract_tokens(data)
     end
+
+    # Read up to 100KB
+    BYTE_LIMIT = 100_000
+
+    # Start state on token, ignore anything till the next newline
+    SINGLE_LINE_COMMENTS = [
+      '//', # C
+      '--', # Ada, Haskell, AppleScript
+      '#',  # Ruby
+      '%',  # Tex
+      '"',  # Vim
+    ]
+
+    # Start state on opening token, ignore anything until the closing
+    # token is reached.
+    MULTI_LINE_COMMENTS = [
+      ['/*', '*/'],    # C
+      ['<!--', '-->'], # XML
+      ['{-', '-}'],    # Haskell
+      ['(*', '*)'],    # Coq
+      ['"""', '"""'],  # Python
+      ["'''", "'''"]   # Python
+    ]
+
+    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
+      "\s*#{Regexp.escape(c)} "
+    }.join("|"))
+
+    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
+      Regexp.escape(c[0])
+    }.join("|"))
+
+    # Internal: Extract generic tokens from data.
+    #
+    # data - String to scan.
+    #
+    # Examples
+    #
+    #   extract_tokens("printf('Hello')")
+    #   # => ['printf', '(', ')']
+    #
+    # Returns Array of token Strings.
+    def extract_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+      until s.eos?
+        break if s.pos >= BYTE_LIMIT
+
+        if token = s.scan(/^#!.+$/)
+          if name = extract_shebang(token)
+            tokens << "SHEBANG#!#{name}"
+          end
+
+        # Single line comment
+        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
+          # tokens << token.strip
+          s.skip_until(/\n|\Z/)
+
+        # Multiline comments
+        elsif token = s.scan(START_MULTI_LINE_COMMENT)
+          # tokens << token
+          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
+          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
+          # tokens << close_token
+
+        # Skip single or double quoted strings
+        elsif s.scan(/"/)
+          if s.peek(1) == "\""
+            s.getch
+          else
+            s.skip_until(/(?<!\\)"/)
+          end
+        elsif s.scan(/'/)
+          if s.peek(1) == "'"
+            s.getch
+          else
+            s.skip_until(/(?<!\\)'/)
+          end
+
+        # Skip number literals
+        elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)
+
+        # SGML style brackets
+        elsif token = s.scan(/<[^\s<>][^<>]*>/)
+          extract_sgml_tokens(token).each { |t| tokens << t }
+
+        # Common programming punctuation
+        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
+          tokens << token
+
+        # Regular token
+        elsif token = s.scan(/[\w\.@#\/\*]+/)
+          tokens << token
+
+        # Common operators
+        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
+          tokens << token
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
+
+    # Internal: Extract normalized shebang command token.
+    #
+    # Examples
+    #
+    #   extract_shebang("#!/usr/bin/ruby")
+    #   # => "ruby"
+    #
+    #   extract_shebang("#!/usr/bin/env node")
+    #   # => "node"
+    #
+    #   extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
+    #   # => "awk"
+    #
+    # Returns String token or nil if it couldn't be parsed.
+    def extract_shebang(data)
+      s = StringScanner.new(data)
+
+      if path = s.scan(/^#!\s*\S+/)
+        script = path.split('/').last
+        if script == 'env'
+          s.scan(/\s+/)
+          s.scan(/.*=[^\s]+\s+/)
+          script = s.scan(/\S+/)
+        end
+        script = script[/[^\d]+/, 0] if script
+        return script
+      end
+
+      nil
+    end
+
+    # Internal: Extract tokens from inside SGML tag.
+    #
+    # data - SGML tag String.
+    #
+    # Examples
+    #
+    #   extract_sgml_tokens("<a href='' class=foo>")
+    #   # => ["<a>", "href="]
+    #
+    # Returns Array of token Strings.
+    def extract_sgml_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+
+      until s.eos?
+        # Emit start token
+        if token = s.scan(/<\/?[^\s>]+/)
+          tokens << "#{token}>"
+
+        # Emit attributes with trailing =
+        elsif token = s.scan(/\w+=/)
+          tokens << token
+
+          # Then skip over attribute value
+          if s.scan(/"/)
+            s.skip_until(/[^\\]"/)
+          elsif s.scan(/'/)
+            s.skip_until(/[^\\]'/)
+          else
+            s.skip_until(/\w+/)
+          end
+
+        # Emit lone attributes
+        elsif token = s.scan(/\w+/)
+          tokens << token
+
+        # Stop at the end of the tag
+        elsif s.scan(/>/)
+          s.terminate
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
   end
 end
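
With the pure-Ruby StringScanner tokenizer restored, tokenization is driven entirely by the regexps above. A quick usage sketch; the expected output follows from tracing the branches (identifiers and punctuation are kept, while the comment and the number literal are dropped):

    require 'linguist/tokenizer'

    Linguist::Tokenizer.tokenize(%(int main() { return 0; /* done */ }))
    # => ["int", "main", "(", ")", "{", "return", ";", "}"]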