Skip emitting comment tokens

This commit is contained in:
Joshua Peek
2012-08-20 10:34:07 -05:00
parent 657adaabec
commit 220108857c
2 changed files with 17 additions and 14 deletions

View File

@@ -16,12 +16,15 @@ module Linguist
new.extract_tokens(data) new.extract_tokens(data)
end end
# Start state on token, ignore anything till the next newline
SINGLE_LINE_COMMENTS = [ SINGLE_LINE_COMMENTS = [
'//', # C '//', # C
'#', # Ruby '#', # Ruby
'%', # Tex '%', # Tex
] ]
# Start state on opening token, ignore anything until the closing
# token is reached.
MULTI_LINE_COMMENTS = [ MULTI_LINE_COMMENTS = [
['/*', '*/'], # C ['/*', '*/'], # C
['<!--', '-->'], # XML ['<!--', '-->'], # XML
@@ -30,7 +33,7 @@ module Linguist
] ]
START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c| START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
"^\s*#{Regexp.escape(c)} " "\s*#{Regexp.escape(c)} "
}.join("|")) }.join("|"))
START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c| START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
@@ -58,16 +61,16 @@ module Linguist
end end
# Single line comment # Single line comment
elsif token = s.scan(START_SINGLE_LINE_COMMENT) elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
tokens << token.strip # tokens << token.strip
s.skip_until(/\n|\Z/) s.skip_until(/\n|\Z/)
# Multiline comments # Multiline comments
elsif token = s.scan(START_MULTI_LINE_COMMENT) elsif token = s.scan(START_MULTI_LINE_COMMENT)
tokens << token # tokens << token
close_token = MULTI_LINE_COMMENTS.assoc(token)[1] close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
s.skip_until(Regexp.compile(Regexp.escape(close_token))) s.skip_until(Regexp.compile(Regexp.escape(close_token)))
tokens << close_token # tokens << close_token
# Skip single or double quoted strings # Skip single or double quoted strings
elsif s.scan(/"/) elsif s.scan(/"/)

View File

@@ -34,15 +34,15 @@ class TestTokenizer < Test::Unit::TestCase
end end
def test_skip_comments def test_skip_comments
assert_equal %w(foo #), tokenize("foo\n# Comment") assert_equal %w(foo), tokenize("foo\n# Comment")
assert_equal %w(foo # bar), tokenize("foo\n# Comment\nbar") assert_equal %w(foo bar), tokenize("foo\n# Comment\nbar")
assert_equal %w(foo //), tokenize("foo\n// Comment") assert_equal %w(foo), tokenize("foo\n// Comment")
assert_equal %w(foo /* */), tokenize("foo /* Comment */") assert_equal %w(foo), tokenize("foo /* Comment */")
assert_equal %w(foo /* */), tokenize("foo /* \nComment\n */") assert_equal %w(foo), tokenize("foo /* \nComment\n */")
assert_equal %w(foo <!-- -->), tokenize("foo <!-- Comment -->") assert_equal %w(foo), tokenize("foo <!-- Comment -->")
assert_equal %w(foo {- -}), tokenize("foo {- Comment -}") assert_equal %w(foo), tokenize("foo {- Comment -}")
assert_equal %w(foo \(* *\)), tokenize("foo (* Comment *)") assert_equal %w(foo), tokenize("foo (* Comment *)")
assert_equal %w(% %), tokenize("2 % 10\n% Comment") assert_equal %w(%), tokenize("2 % 10\n% Comment")
end end
def test_sgml_tags def test_sgml_tags