diff --git a/lib/linguist/tokenizer.rb b/lib/linguist/tokenizer.rb
index 5682173b..ef3d54b7 100644
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -16,12 +16,15 @@ module Linguist
       new.extract_tokens(data)
     end
 
+    # Start state on token, ignore anything till the next newline
     SINGLE_LINE_COMMENTS = [
       '//', # C
       '#',  # Ruby
       '%',  # Tex
     ]
 
+    # Start state on opening token, ignore anything until the closing
+    # token is reached.
     MULTI_LINE_COMMENTS = [
       ['/*', '*/'],    # C
       ['<!--', '-->'], # XML
@@ -30,7 +33,7 @@ module Linguist
     ]
 
     START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
-      "^\s*#{Regexp.escape(c)} "
+      "\s*#{Regexp.escape(c)} "
     }.join("|"))
 
     START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
@@ -58,16 +61,16 @@ module Linguist
        end
 
        # Single line comment
-        elsif token = s.scan(START_SINGLE_LINE_COMMENT)
-          tokens << token.strip
+        elsif s.beginning_of_line? && token = s.scan(START_SINGLE_LINE_COMMENT)
+          # tokens << token.strip
          s.skip_until(/\n|\Z/)
 
        # Multiline comments
        elsif token = s.scan(START_MULTI_LINE_COMMENT)
-          tokens << token
+          # tokens << token
          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
-          tokens << close_token
+          # tokens << close_token
 
        # Skip single or double quoted strings
        elsif s.scan(/"/)
diff --git a/test/test_tokenizer.rb b/test/test_tokenizer.rb
index 4fb49a4a..af649204 100644
--- a/test/test_tokenizer.rb
+++ b/test/test_tokenizer.rb
@@ -34,15 +34,15 @@ class TestTokenizer < Test::Unit::TestCase
   end
 
   def test_skip_comments
-    assert_equal %w(foo #), tokenize("foo\n# Comment")
-    assert_equal %w(foo # bar), tokenize("foo\n# Comment\nbar")
-    assert_equal %w(foo //), tokenize("foo\n// Comment")
-    assert_equal %w(foo /* */), tokenize("foo /* Comment */")
-    assert_equal %w(foo /* */), tokenize("foo /* \nComment\n */")
-    assert_equal %w(foo <!-- -->), tokenize("foo <!-- Comment -->")
-    assert_equal %w(foo {- -}), tokenize("foo {- Comment -}")
-    assert_equal %w(foo \(* *\)), tokenize("foo (* Comment *)")
-    assert_equal %w(% %), tokenize("2 % 10\n% Comment")
+    assert_equal %w(foo), tokenize("foo\n# Comment")
+    assert_equal %w(foo bar), tokenize("foo\n# Comment\nbar")
+    assert_equal %w(foo), tokenize("foo\n// Comment")
+    assert_equal %w(foo), tokenize("foo /* Comment */")
+    assert_equal %w(foo), tokenize("foo /* \nComment\n */")
+    assert_equal %w(foo), tokenize("foo <!-- Comment -->")
+    assert_equal %w(foo), tokenize("foo {- Comment -}")
+    assert_equal %w(foo), tokenize("foo (* Comment *)")
+    assert_equal %w(%), tokenize("2 % 10\n% Comment")
   end
 
   def test_sgml_tags