Extract single and multi line comment parser

2025-10-29 17:50:22 +00:00 · 2012-07-20 15:06:21 -05:00
parent d063089430
commit 175d4244c2
2 changed files with 33 additions and 39 deletions
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -24,6 +24,27 @@ module Linguist
      extract_tokens(data)
    end
    # Tokens that open a comment running to the end of the line.
    SINGLE_LINE_COMMENTS = [
      '//', # C
      '#',  # Ruby
      '%',  # Tex
    ].freeze

    # [open, close] delimiter pairs for comments that may span several lines.
    # Looked up by opening delimiter (e.g. via Array#assoc) to find the closer.
    MULTI_LINE_COMMENTS = [
      ['/*', '*/'],    # C
      ['<!--', '-->'], # XML
      ['{-', '-}'],    # Haskell
      ['(*', '*)']     # Coq
    ].freeze

    # Matches a single line comment opener at the start of a line, allowing
    # leading indentation and requiring one space after the opener.
    #
    # The pattern is written as a regexp literal on purpose: inside a
    # double-quoted string "\s" is the escape for a plain space character, so
    # a string-built "^\s*" would silently exclude tab indentation. The
    # explicit [ \t] class keeps the match on a single line.
    START_SINGLE_LINE_COMMENT = Regexp.union(SINGLE_LINE_COMMENTS.map { |c|
      /^[ \t]*#{Regexp.escape(c)} /
    })

    # Matches any multi line comment opening delimiter. Regexp.union escapes
    # the plain strings it is given, so no manual escaping is needed.
    START_MULTI_LINE_COMMENT = Regexp.union(MULTI_LINE_COMMENTS.map { |c| c[0] })
    # Internal: Extract generic tokens from data.
    #
    # data - String to scan.
@@ -39,44 +60,17 @@ module Linguist
      tokens = []
      until s.eos?
-        # Ruby single line comment
+        # Single line comment
-        if token = s.scan(/# /)
+        if token = s.scan(START_SINGLE_LINE_COMMENT)
-          tokens << "#"
+          tokens << token.strip
          s.skip_until(/\n|\Z/)
-        # C style single line comment
+        # Multiline comments
-        elsif token = s.scan(/\/\/ /)
+        elsif token = s.scan(START_MULTI_LINE_COMMENT)
-          tokens << "//"
+          tokens << token
-          s.skip_until(/\n|\Z/)
+          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
-
+          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
-        # Leading Tex or Matlab comments
+          tokens << close_token
        elsif token = s.scan(/\n%/)
          tokens << "%"
          s.skip_until(/\n|\Z/)
        # C multiline comments
        elsif token = s.scan(/\/\*/)
          tokens << "/*"
          s.skip_until(/\*\//)
          tokens << "*/"
        # Haskell multiline comments
        elsif token = s.scan(/\{-/)
          tokens << "{-"
          s.skip_until(/-\}/)
          tokens << "-}"
        # XML multiline comments
        elsif token = s.scan(/<!--/)
          tokens << "<!--"
          s.skip_until(/-->/)
          tokens << "-->"
        # Coq multiline comments
        elsif token = s.scan(/\(\*/)
          tokens << "(*"
          s.skip_until(/\*\)/)
          tokens << "*)"
        # Skip single or double quoted strings
        elsif s.scan(/"/)
--- a/test/test_tokenizer.rb
+++ b/test/test_tokenizer.rb
@@ -29,9 +29,9 @@ class TestTokenizer < Test::Unit::TestCase
  end
  def test_skip_comments
-    assert_equal %w(foo #), tokenize("foo # Comment")
+    assert_equal %w(foo #), tokenize("foo\n# Comment")
-    assert_equal %w(foo # bar), tokenize("foo # Comment\nbar")
+    assert_equal %w(foo # bar), tokenize("foo\n# Comment\nbar")
-    assert_equal %w(foo //), tokenize("foo // Comment")
+    assert_equal %w(foo //), tokenize("foo\n// Comment")
    assert_equal %w(foo /* */), tokenize("foo /* Comment */")
    assert_equal %w(foo /* */), tokenize("foo /* \nComment\n */")
    assert_equal %w(foo <!-- -->), tokenize("foo <!-- Comment -->")