diff --git a/lib/linguist/tokenizer.rb b/lib/linguist/tokenizer.rb
index a8afe676..83706064 100644
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -24,6 +24,27 @@ module Linguist
       extract_tokens(data)
     end
 
+    SINGLE_LINE_COMMENTS = [
+      '//', # C
+      '#',  # Ruby
+      '%',  # Tex
+    ]
+
+    MULTI_LINE_COMMENTS = [
+      ['/*', '*/'],    # C
+      ['<!--', '-->'], # XML
+      ['{-', '-}'],    # Haskell
+      ['(*', '*)']     # Coq
+    ]
+
+    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
+      "^\s*#{Regexp.escape(c)} "
+    }.join("|"))
+
+    START_MULTI_LINE_COMMENT = Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
+      Regexp.escape(c[0])
+    }.join("|"))
+
     # Internal: Extract generic tokens from data.
     #
     # data - String to scan.
@@ -39,44 +60,17 @@ module Linguist
       tokens = []
 
       until s.eos?
-        # Ruby single line comment
-        if token = s.scan(/# /)
-          tokens << "#"
+        # Single line comment
+        if token = s.scan(START_SINGLE_LINE_COMMENT)
+          tokens << token.strip
           s.skip_until(/\n|\Z/)
 
-        # C style single line comment
-        elsif token = s.scan(/\/\/ /)
-          tokens << "//"
-          s.skip_until(/\n|\Z/)
-
-        # Leading Tex or Matlab comments
-        elsif token = s.scan(/\n%/)
-          tokens << "%"
-          s.skip_until(/\n|\Z/)
-
-        # C multiline comments
-        elsif token = s.scan(/\/\*/)
-          tokens << "/*"
-          s.skip_until(/\*\//)
-          tokens << "*/"
-
-        # Haskell multiline comments
-        elsif token = s.scan(/\{-/)
-          tokens << "{-"
-          s.skip_until(/-\}/)
-          tokens << "-}"
-
-        # XML multiline comments
-        elsif token = s.scan(/<!--/)
-          tokens << "<!--"
-          s.skip_until(/-->/)
-          tokens << "-->"
-
-        # Coq multiline comments
-        elsif token = s.scan(/\(\*/)
-          tokens << "(*"
-          s.skip_until(/\*\)/)
-          tokens << "*)"
+        # Multiline comments
+        elsif token = s.scan(START_MULTI_LINE_COMMENT)
+          tokens << token
+          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
+          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
+          tokens << close_token
 
         # Skip single or double quoted strings
         elsif s.scan(/"/)
diff --git a/test/test_tokenizer.rb b/test/test_tokenizer.rb
index e757c7d7..bbd43872 100644
--- a/test/test_tokenizer.rb
+++ b/test/test_tokenizer.rb
@@ -29,9 +29,9 @@ class TestTokenizer < Test::Unit::TestCase
   end
 
   def test_skip_comments
-    assert_equal %w(foo #), tokenize("foo # Comment")
-    assert_equal %w(foo # bar), tokenize("foo # Comment\nbar")
-    assert_equal %w(foo //), tokenize("foo // Comment")
+    assert_equal %w(foo #), tokenize("foo\n# Comment")
+    assert_equal %w(foo # bar), tokenize("foo\n# Comment\nbar")
+    assert_equal %w(foo //), tokenize("foo\n// Comment")
     assert_equal %w(foo /* */), tokenize("foo /* Comment */")
     assert_equal %w(foo /* */), tokenize("foo /* \nComment\n */")
     assert_equal %w(foo <!-- -->), tokenize("foo <!-- Comment -->")
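
Not part of the patch itself: a minimal standalone sketch of the constant-driven comment scanning introduced above. The `tokenize_comments` helper name and the simplified word/skip loop are assumptions made for illustration only; the real tokenizer also handles strings, numbers, and SGML, which this sketch omits. The sample inputs mirror the updated tests.

require 'strscan'

# Same comment tables as the patch introduces.
SINGLE_LINE_COMMENTS = ['//', '#', '%']
MULTI_LINE_COMMENTS  = [['/*', '*/'], ['<!--', '-->'], ['{-', '-}'], ['(*', '*)']]

# Build one regexp per table, as the patch does with Regexp.compile.
START_SINGLE_LINE_COMMENT = Regexp.new(SINGLE_LINE_COMMENTS.map { |c|
  "^\\s*#{Regexp.escape(c)} "
}.join("|"))
START_MULTI_LINE_COMMENT = Regexp.new(MULTI_LINE_COMMENTS.map { |c|
  Regexp.escape(c[0])
}.join("|"))

# Hypothetical helper: only words and comments, no string/number handling.
def tokenize_comments(data)
  s = StringScanner.new(data)
  tokens = []
  until s.eos?
    if token = s.scan(START_SINGLE_LINE_COMMENT)
      tokens << token.strip   # keep the comment marker, drop surrounding whitespace
      s.skip_until(/\n|\Z/)   # discard the comment body
    elsif token = s.scan(START_MULTI_LINE_COMMENT)
      tokens << token
      close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
      s.skip_until(Regexp.new(Regexp.escape(close_token)))
      tokens << close_token
    elsif token = s.scan(/\w+/)
      tokens << token
    else
      s.getch                 # skip anything else one character at a time
    end
  end
  tokens
end

p tokenize_comments("foo\n# Comment\nbar")  #=> ["foo", "#", "bar"]
p tokenize_comments("foo /* Comment */")    #=> ["foo", "/*", "*/"]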