Mirror of https://github.com/KevinMidboe/linguist.git, synced 2025-10-29 17:50:22 +00:00.
Extract single- and multi-line comment parser
This commit is contained in:
@@ -24,6 +24,27 @@ module Linguist
|
||||
extract_tokens(data)
|
||||
end
|
||||
|
||||
# Tokens that introduce a comment running to the end of the line,
# annotated with a language that uses each one.
SINGLE_LINE_COMMENTS = [
  '//', # C
  '#',  # Ruby
  '%',  # Tex
].freeze

# [open, close] delimiter pairs for block comments, annotated with a
# language that uses each pair. Looked up via Array#assoc on the
# opening token to recover the matching closer.
MULTI_LINE_COMMENTS = [
  ['/*', '*/'],    # C
  ['<!--', '-->'], # XML
  ['{-', '-}'],    # Haskell
  ['(*', '*)']     # Coq
].freeze

# Matches a line beginning (after optional leading whitespace) with any
# single-line comment token followed by a space.
#
# NOTE: this must be "\\s", not "\s" — inside a double-quoted Ruby
# string "\s" expands to a literal space, which would silently turn the
# pattern into "^ *…" and miss tab-indented comments.
START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
  "^\\s*#{Regexp.escape(c)} "
}.join("|"))

# Matches the opening delimiter of any multi-line comment.
START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
  Regexp.escape(c[0])
}.join("|"))
|
||||
|
||||
# Internal: Extract generic tokens from data.
|
||||
#
|
||||
# data - String to scan.
|
||||
@@ -39,44 +60,17 @@ module Linguist
|
||||
|
||||
tokens = []
|
||||
until s.eos?
|
||||
# Ruby single line comment
|
||||
if token = s.scan(/# /)
|
||||
tokens << "#"
|
||||
# Single line comment
|
||||
if token = s.scan(START_SINGLE_LINE_COMMENT)
|
||||
tokens << token.strip
|
||||
s.skip_until(/\n|\Z/)
|
||||
|
||||
# C style single line comment
|
||||
elsif token = s.scan(/\/\/ /)
|
||||
tokens << "//"
|
||||
s.skip_until(/\n|\Z/)
|
||||
|
||||
# Leading Tex or Matlab comments
|
||||
elsif token = s.scan(/\n%/)
|
||||
tokens << "%"
|
||||
s.skip_until(/\n|\Z/)
|
||||
|
||||
# C multiline comments
|
||||
elsif token = s.scan(/\/\*/)
|
||||
tokens << "/*"
|
||||
s.skip_until(/\*\//)
|
||||
tokens << "*/"
|
||||
|
||||
# Haskell multiline comments
|
||||
elsif token = s.scan(/\{-/)
|
||||
tokens << "{-"
|
||||
s.skip_until(/-\}/)
|
||||
tokens << "-}"
|
||||
|
||||
# XML multiline comments
|
||||
elsif token = s.scan(/<!--/)
|
||||
tokens << "<!--"
|
||||
s.skip_until(/-->/)
|
||||
tokens << "-->"
|
||||
|
||||
# Coq multiline comments
|
||||
elsif token = s.scan(/\(\*/)
|
||||
tokens << "(*"
|
||||
s.skip_until(/\*\)/)
|
||||
tokens << "*)"
|
||||
# Multiline comments
|
||||
elsif token = s.scan(START_MULTI_LINE_COMMENT)
|
||||
tokens << token
|
||||
close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
|
||||
s.skip_until(Regexp.compile(Regexp.escape(close_token)))
|
||||
tokens << close_token
|
||||
|
||||
# Skip single or double quoted strings
|
||||
elsif s.scan(/"/)
|
||||
|
||||
@@ -29,9 +29,9 @@ class TestTokenizer < Test::Unit::TestCase
|
||||
end
|
||||
|
||||
def test_skip_comments
|
||||
assert_equal %w(foo #), tokenize("foo # Comment")
|
||||
assert_equal %w(foo # bar), tokenize("foo # Comment\nbar")
|
||||
assert_equal %w(foo //), tokenize("foo // Comment")
|
||||
assert_equal %w(foo #), tokenize("foo\n# Comment")
|
||||
assert_equal %w(foo # bar), tokenize("foo\n# Comment\nbar")
|
||||
assert_equal %w(foo //), tokenize("foo\n// Comment")
|
||||
assert_equal %w(foo /* */), tokenize("foo /* Comment */")
|
||||
assert_equal %w(foo /* */), tokenize("foo /* \nComment\n */")
|
||||
assert_equal %w(foo <!-- -->), tokenize("foo <!-- Comment -->")
|
||||
|
||||
Reference in New Issue
Block a user