Mirror of https://github.com/KevinMidboe/linguist.git (synced 2025-12-08 20:38:47 +00:00)
Revert "Replace the tokenizer with a flex-based scanner (#3846)"
This reverts commit 99eaf5faf9.
@@ -275,8 +275,10 @@ module Linguist
           # also--importantly--without having to duplicate many (potentially
           # large) strings.
           begin
-            data.split(encoded_newlines_re, -1)
+            encoded_newlines = ["\r\n", "\r", "\n"].
+              map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) }
+
+            data.split(Regexp.union(encoded_newlines), -1)
           rescue Encoding::ConverterNotFoundError
             # The data is not splittable in the detected encoding. Assume it's
             # one big line.
@@ -287,51 +289,6 @@ module Linguist
         end
     end
 
-    def encoded_newlines_re
-      @encoded_newlines_re ||= Regexp.union(["\r\n", "\r", "\n"].
-        map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) })
-
-    end
-
-    def first_lines(n)
-      return lines[0...n] if defined? @lines
-      return [] unless viewable? && data
-
-      i, c = 0, 0
-      while c < n && j = data.index(encoded_newlines_re, i)
-        i = j + $&.length
-        c += 1
-      end
-      data[0...i].split(encoded_newlines_re, -1)
-    end
-
-    def last_lines(n)
-      if defined? @lines
-        if n >= @lines.length
-          @lines
-        else
-          lines[-n..-1]
-        end
-      end
-      return [] unless viewable? && data
-
-      no_eol = true
-      i, c = data.length, 0
-      k = i
-      while c < n && j = data.rindex(encoded_newlines_re, i - 1)
-        if c == 0 && j + $&.length == i
-          no_eol = false
-          n += 1
-        end
-        i = j
-        k = j + $&.length
-        c += 1
-      end
-      r = data[k..-1].split(encoded_newlines_re, -1)
-      r.pop if !no_eol
-      r
-    end
-
     # Public: Get number of lines of code
     #
     # Requires Blob#data
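For context on the two hunks above: the revert drops the memoized encoded_newlines_re helper (and the first_lines/last_lines readers that depended on it) and goes back to building the newline regex inline inside the lines method. A minimal sketch of the restored splitting behaviour, assuming an already-detected ruby_encoding; the sample data is illustrative:

    data = "a\r\nb\r\nc\n"
    ruby_encoding = "UTF-8"  # assumption: the encoding Linguist detected for this blob
    encoded_newlines = ["\r\n", "\r", "\n"].
      map { |nl| nl.encode(ruby_encoding, "ASCII-8BIT").force_encoding(data.encoding) }
    data.split(Regexp.union(encoded_newlines), -1)
    # => ["a", "b", "c", ""]  ("\r\n" is listed first so CRLF counts as one newline;
    #    the -1 limit keeps the trailing empty string after the final newline)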
@@ -3,8 +3,6 @@ require 'linguist/tokenizer'
 module Linguist
   # Language bayesian classifier.
   class Classifier
-    CLASSIFIER_CONSIDER_BYTES = 50 * 1024
-
     # Public: Use the classifier to detect language of the blob.
     #
     # blob - An object that quacks like a blob.
@@ -19,7 +17,7 @@ module Linguist
     # Returns an Array of Language objects, most probable first.
     def self.call(blob, possible_languages)
       language_names = possible_languages.map(&:name)
-      classify(Samples.cache, blob.data[0...CLASSIFIER_CONSIDER_BYTES], language_names).map do |name, _|
+      classify(Samples.cache, blob.data, language_names).map do |name, _|
         Language[name] # Return the actual Language objects
       end
     end
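With CLASSIFIER_CONSIDER_BYTES removed, the classifier is handed the whole of blob.data again instead of only the first 50 KB. A hedged sketch of the call shape; the path and candidate list are illustrative:

    blob       = Linguist::FileBlob.new("app/models/user.rb")
    candidates = [Linguist::Language["Ruby"], Linguist::Language["Python"]]
    Linguist::Classifier.call(blob, candidates)
    # => Array of Language objects, most probable first, scored over the full blob.data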
@@ -23,21 +23,21 @@ module Linguist
     #
     # Returns a String like '100644'
     def mode
-      @mode ||= File.stat(@fullpath).mode.to_s(8)
+      File.stat(@fullpath).mode.to_s(8)
     end
 
     # Public: Read file contents.
     #
     # Returns a String.
     def data
-      @data ||= File.read(@fullpath)
+      File.read(@fullpath)
     end
 
     # Public: Get byte size
     #
     # Returns an Integer.
     def size
-      @size ||= File.size(@fullpath)
+      File.size(@fullpath)
     end
   end
 end
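This hunk also undoes the @mode/@data/@size memoization, so each accessor goes back to touching the filesystem on every call. A hedged sketch of the observable difference; the path is illustrative:

    blob  = Linguist::FileBlob.new("README.md")
    first = blob.data          # File.read on this call
    File.write("README.md", first + "\n")
    blob.data == first         # => false: data is re-read each time, whereas the
                               #    reverted memoized version would return the cached string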
@@ -1,8 +1,6 @@
 module Linguist
   # A collection of simple heuristics that can be used to better analyze languages.
   class Heuristics
-    HEURISTICS_CONSIDER_BYTES = 50 * 1024
-
     # Public: Use heuristics to detect language of the blob.
     #
     # blob - An object that quacks like a blob.
@@ -16,7 +14,7 @@ module Linguist
     #
     # Returns an Array of languages, or empty if none matched or were inconclusive.
     def self.call(blob, candidates)
-      data = blob.data[0...HEURISTICS_CONSIDER_BYTES]
+      data = blob.data
 
       @heuristics.each do |heuristic|
         if heuristic.matches?(blob.name, candidates)
@@ -74,14 +72,6 @@ module Linguist
 
     # Common heuristics
     ObjectiveCRegex = /^\s*(@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">])/
-    CPlusPlusRegex = Regexp.union(
-      /^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/,
-      /^\s*template\s*</,
-      /^[ \t]*try/,
-      /^[ \t]*catch\s*\(/,
-      /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/,
-      /^[ \t]*(private|public|protected):$/,
-      /std::\w+/)
 
     disambiguate ".as" do |data|
       if /^\s*(package\s+[a-z0-9_\.]+|import\s+[a-zA-Z0-9_\.]+;|class\s+[A-Za-z0-9_]+\s+extends\s+[A-Za-z0-9_]+)/.match(data)
@@ -229,7 +219,8 @@ module Linguist
     disambiguate ".h" do |data|
       if ObjectiveCRegex.match(data)
         Language["Objective-C"]
-      elsif CPlusPlusRegex.match(data)
+      elsif (/^\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>/.match(data) ||
+             /^\s*template\s*</.match(data) || /^[ \t]*try/.match(data) || /^[ \t]*catch\s*\(/.match(data) || /^[ \t]*(class|(using[ \t]+)?namespace)\s+\w+/.match(data) || /^[ \t]*(private|public|protected):$/.match(data) || /std::\w+/.match(data))
         Language["C++"]
       end
     end
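After the revert the C++ patterns live inline in the ".h" block rather than in the shared CPlusPlusRegex constant; the alternatives themselves are unchanged. A hedged sketch of the kind of header each branch matches; the sample snippets are illustrative:

    objc_header = "#import <Foundation/Foundation.h>\n@interface Foo : NSObject\n@end\n"
    cpp_header  = "#include <vector>\ntemplate <typename T>\nclass Stack {};\n"
    objc_header =~ Linguist::Heuristics::ObjectiveCRegex
    # matches, so the ".h" rule returns Language["Objective-C"]
    cpp_header  =~ /^\s*template\s*</
    # matches one of the inlined C++ branches, so (absent Objective-C markers) Language["C++"]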
@@ -109,8 +109,8 @@ module Linguist
       # Returns an Array with one Language if the blob has a Vim or Emacs modeline
       # that matches a Language name or alias. Returns an empty array if no match.
       def self.call(blob, _ = nil)
-        header = blob.first_lines(SEARCH_SCOPE).join("\n")
-        footer = blob.last_lines(SEARCH_SCOPE).join("\n")
+        header = blob.lines.first(SEARCH_SCOPE).join("\n")
+        footer = blob.lines.last(SEARCH_SCOPE).join("\n")
         Array(Language.find_by_alias(modeline(header + footer)))
       end
 
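The modeline strategy goes back to blob.lines.first and blob.lines.last, so it works off the fully split line array rather than the bounded first_lines/last_lines readers removed above. A hedged sketch of what the strategy ends up inspecting; blob stands for any blob-like object, and SEARCH_SCOPE is the constant already defined in this file:

    scope  = Linguist::Strategy::Modeline::SEARCH_SCOPE
    header = blob.lines.first(scope).join("\n")
    footer = blob.lines.last(scope).join("\n")
    # a line such as "# vim: set filetype=ruby:" in either slice resolves,
    # via Language.find_by_alias, to Language["Ruby"]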
@@ -1,5 +1,4 @@
+require 'strscan'
-require 'linguist/linguist'
 
 module Linguist
   # Generic programming language tokenizer.
@@ -16,5 +15,191 @@ module Linguist
     def self.tokenize(data)
       new.extract_tokens(data)
     end
+
+    # Read up to 100KB
+    BYTE_LIMIT = 100_000
+
+    # Start state on token, ignore anything till the next newline
+    SINGLE_LINE_COMMENTS = [
+      '//', # C
+      '--', # Ada, Haskell, AppleScript
+      '#',  # Ruby
+      '%',  # Tex
+      '"',  # Vim
+    ]
+
+    # Start state on opening token, ignore anything until the closing
+    # token is reached.
+    MULTI_LINE_COMMENTS = [
+      ['/*', '*/'],    # C
+      ['<!--', '-->'], # XML
+      ['{-', '-}'],    # Haskell
+      ['(*', '*)'],    # Coq
+      ['"""', '"""'],  # Python
+      ["'''", "'''"]   # Python
+    ]
+
+    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
+      "\s*#{Regexp.escape(c)} "
+    }.join("|"))
+
+    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
+      Regexp.escape(c[0])
+    }.join("|"))
+
+    # Internal: Extract generic tokens from data.
+    #
+    # data - String to scan.
+    #
+    # Examples
+    #
+    #   extract_tokens("printf('Hello')")
+    #   # => ['printf', '(', ')']
+    #
+    # Returns Array of token Strings.
+    def extract_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+      until s.eos?
+        break if s.pos >= BYTE_LIMIT
+
+        if token = s.scan(/^#!.+$/)
+          if name = extract_shebang(token)
+            tokens << "SHEBANG#!#{name}"
+          end
+
+        # Single line comment
+        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
+          # tokens << token.strip
+          s.skip_until(/\n|\Z/)
+
+        # Multiline comments
+        elsif token = s.scan(START_MULTI_LINE_COMMENT)
+          # tokens << token
+          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
+          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
+          # tokens << close_token
+
+        # Skip single or double quoted strings
+        elsif s.scan(/"/)
+          if s.peek(1) == "\""
+            s.getch
+          else
+            s.skip_until(/(?<!\\)"/)
+          end
+        elsif s.scan(/'/)
+          if s.peek(1) == "'"
+            s.getch
+          else
+            s.skip_until(/(?<!\\)'/)
+          end
+
+        # Skip number literals
+        elsif s.scan(/(0x\h(\h|\.)*|\d(\d|\.)*)([uU][lL]{0,2}|([eE][-+]\d*)?[fFlL]*)/)
+
+        # SGML style brackets
+        elsif token = s.scan(/<[^\s<>][^<>]*>/)
+          extract_sgml_tokens(token).each { |t| tokens << t }
+
+        # Common programming punctuation
+        elsif token = s.scan(/;|\{|\}|\(|\)|\[|\]/)
+          tokens << token
+
+        # Regular token
+        elsif token = s.scan(/[\w\.@#\/\*]+/)
+          tokens << token
+
+        # Common operators
+        elsif token = s.scan(/<<?|\+|\-|\*|\/|%|&&?|\|\|?/)
+          tokens << token
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
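As restored, extract_tokens drops comments, string literals, and numbers, and keeps identifiers, punctuation, and operators. A hedged example of that behaviour; the expected output follows the branches above rather than a captured run:

    Linguist::Tokenizer.new.extract_tokens(%(// setup\nname = "linguist"; count = 42;))
    # => ["name", ";", "count", ";"]
    #    the line comment, the quoted string, the "=" signs, and 42 are all skipped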
+
+    # Internal: Extract normalized shebang command token.
+    #
+    # Examples
+    #
+    #   extract_shebang("#!/usr/bin/ruby")
+    #   # => "ruby"
+    #
+    #   extract_shebang("#!/usr/bin/env node")
+    #   # => "node"
+    #
+    #   extract_shebang("#!/usr/bin/env A=B foo=bar awk -f")
+    #   # => "awk"
+    #
+    # Returns String token or nil if it couldn't be parsed.
+    def extract_shebang(data)
+      s = StringScanner.new(data)
+
+      if path = s.scan(/^#!\s*\S+/)
+        script = path.split('/').last
+        if script == 'env'
+          s.scan(/\s+/)
+          s.scan(/.*=[^\s]+\s+/)
+          script = s.scan(/\S+/)
+        end
+        script = script[/[^\d]+/, 0] if script
+        return script
+      end
+
+      nil
+    end
+
+    # Internal: Extract tokens from inside SGML tag.
+    #
+    # data - SGML tag String.
+    #
+    # Examples
+    #
+    #   extract_sgml_tokens("<a href='' class=foo>")
+    #   # => ["<a>", "href="]
+    #
+    # Returns Array of token Strings.
+    def extract_sgml_tokens(data)
+      s = StringScanner.new(data)
+
+      tokens = []
+
+      until s.eos?
+        # Emit start token
+        if token = s.scan(/<\/?[^\s>]+/)
+          tokens << "#{token}>"
+
+        # Emit attributes with trailing =
+        elsif token = s.scan(/\w+=/)
+          tokens << token
+
+          # Then skip over attribute value
+          if s.scan(/"/)
+            s.skip_until(/[^\\]"/)
+          elsif s.scan(/'/)
+            s.skip_until(/[^\\]'/)
+          else
+            s.skip_until(/\w+/)
+          end
+
+        # Emit lone attributes
+        elsif token = s.scan(/\w+/)
+          tokens << token
+
+        # Stop at the end of the tag
+        elsif s.scan(/>/)
+          s.terminate
+
+        else
+          s.getch
+        end
+      end
+
+      tokens
+    end
   end
 end
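Taken together, the restored class can be exercised directly through Tokenizer.tokenize. A hedged usage sketch that leans on the shebang and SGML handling shown above; the expected values are reasoned from that code rather than captured output:

    require 'linguist'

    Linguist::Tokenizer.tokenize("#!/usr/bin/env ruby\nputs 'hi'")
    # => ["SHEBANG#!ruby", "puts"]

    Linguist::Tokenizer.tokenize(%(<div class="note">Text</div>))
    # => ["<div>", "class=", "Text", "</div>"]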