Extract single and multi line comment parser

This commit is contained in:
Joshua Peek
2012-07-20 15:06:21 -05:00
parent d063089430
commit 175d4244c2
2 changed files with 33 additions and 39 deletions

View File

@@ -24,6 +24,27 @@ module Linguist
extract_tokens(data)
end
# Markers that introduce a comment running to the end of the line.
SINGLE_LINE_COMMENTS = [
  '//', # C
  '#',  # Ruby
  '%'   # Tex
].freeze

# [open, close] marker pairs for comments that may span several lines.
# Looked up by opening marker (e.g. with Array#assoc) to find the closer.
MULTI_LINE_COMMENTS = [
  ['/*', '*/'],    # C
  ['<!--', '-->'], # XML
  ['{-', '-}'],    # Haskell
  ['(*', '*)']     # Coq
].freeze

# Matches a single line comment marker at the start of a line, optionally
# indented with spaces or tabs, and followed by exactly one space.
#
# The indentation is spelled as the explicit class [ \t] on purpose: inside a
# double-quoted Ruby string "\s" is the escape for a literal space character,
# not the regexp whitespace class, so "\s*" here would only ever allow spaces
# and tab-indented comments would not be recognised.
START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
  "^[ \\t]*#{Regexp.escape(c)} "
}.join("|"))

# Matches any of the multi line comment opening markers. Regexp.union escapes
# each marker, so they are matched literally.
START_MULTI_LINE_COMMENT = Regexp.union(MULTI_LINE_COMMENTS.map(&:first))
# Internal: Extract generic tokens from data.
#
# data - String to scan.
@@ -39,44 +60,17 @@ module Linguist
tokens = []
until s.eos?
# Ruby single line comment
if token = s.scan(/# /)
tokens << "#"
# Single line comment
if token = s.scan(START_SINGLE_LINE_COMMENT)
tokens << token.strip
s.skip_until(/\n|\Z/)
# C style single line comment
elsif token = s.scan(/\/\/ /)
tokens << "//"
s.skip_until(/\n|\Z/)
# Leading Tex or Matlab comments
elsif token = s.scan(/\n%/)
tokens << "%"
s.skip_until(/\n|\Z/)
# C multiline comments
elsif token = s.scan(/\/\*/)
tokens << "/*"
s.skip_until(/\*\//)
tokens << "*/"
# Haskell multiline comments
elsif token = s.scan(/\{-/)
tokens << "{-"
s.skip_until(/-\}/)
tokens << "-}"
# XML multiline comments
elsif token = s.scan(/<!--/)
tokens << "<!--"
s.skip_until(/-->/)
tokens << "-->"
# Coq multiline comments
elsif token = s.scan(/\(\*/)
tokens << "(*"
s.skip_until(/\*\)/)
tokens << "*)"
# Multiline comments
elsif token = s.scan(START_MULTI_LINE_COMMENT)
tokens << token
close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
s.skip_until(Regexp.compile(Regexp.escape(close_token)))
tokens << close_token
# Skip single or double quoted strings
elsif s.scan(/"/)

View File

@@ -29,9 +29,9 @@ class TestTokenizer < Test::Unit::TestCase
end
def test_skip_comments
assert_equal %w(foo #), tokenize("foo # Comment")
assert_equal %w(foo # bar), tokenize("foo # Comment\nbar")
assert_equal %w(foo //), tokenize("foo // Comment")
assert_equal %w(foo #), tokenize("foo\n# Comment")
assert_equal %w(foo # bar), tokenize("foo\n# Comment\nbar")
assert_equal %w(foo //), tokenize("foo\n// Comment")
assert_equal %w(foo /* */), tokenize("foo /* Comment */")
assert_equal %w(foo /* */), tokenize("foo /* \nComment\n */")
assert_equal %w(foo <!-- -->), tokenize("foo <!-- Comment -->")