Extract single- and multi-line comment parser

2025-12-29 21:31:01 +00:00 · 2012-07-20 15:06:21 -05:00
parent d063089430
commit 175d4244c2
2 changed files with 33 additions and 39 deletions
--- a/lib/linguist/tokenizer.rb
+++ b/lib/linguist/tokenizer.rb
@@ -24,6 +24,27 @@ module Linguist
      extract_tokens(data)
    end

+    SINGLE_LINE_COMMENTS = [ # markers whose comment runs to the end of the line
+      '//', # C style
+      '#',  # Ruby
+      '%',  # TeX or Matlab
+    ]
+
+    MULTI_LINE_COMMENTS = [ # [opening, closing] delimiter pairs
+      ['/*', '*/'],    # C
+      ['<!--', '-->'], # XML
+      ['{-', '-}'],    # Haskell
+      ['(*', '*)']     # Coq
+    ]
+
+    START_SINGLE_LINE_COMMENT = Regexp.compile(SINGLE_LINE_COMMENTS.map { |c| # line start, indent, marker, one space
+      "^[ \t]*#{Regexp.escape(c)} " # a "\s" written here would be a literal space in a double-quoted string, so tabs would not match
+    }.join("|"))
+
+    START_MULTI_LINE_COMMENT =  Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
+      Regexp.escape(c[0]) # c[0] is the opening delimiter, escaped so it matches literally
+    }.join("|")) # alternation: matches any one opening delimiter
+
    # Internal: Extract generic tokens from data.
    #
    # data - String to scan.
@@ -39,44 +60,17 @@ module Linguist

      tokens = []
      until s.eos?
-        # Ruby single line comment
-        if token = s.scan(/# /)
-          tokens << "#"
+        # Single line comment
+        if token = s.scan(START_SINGLE_LINE_COMMENT)
+          tokens << token.strip
          s.skip_until(/\n|\Z/)

-        # C style single line comment
-        elsif token = s.scan(/\/\/ /)
-          tokens << "//"
-          s.skip_until(/\n|\Z/)
-
-        # Leading Tex or Matlab comments
-        elsif token = s.scan(/\n%/)
-          tokens << "%"
-          s.skip_until(/\n|\Z/)
-
-        # C multiline comments
-        elsif token = s.scan(/\/\*/)
-          tokens << "/*"
-          s.skip_until(/\*\//)
-          tokens << "*/"
-
-        # Haskell multiline comments
-        elsif token = s.scan(/\{-/)
-          tokens << "{-"
-          s.skip_until(/-\}/)
-          tokens << "-}"
-
-        # XML multiline comments
-        elsif token = s.scan(/<!--/)
-          tokens << "<!--"
-          s.skip_until(/-->/)
-          tokens << "-->"
-
-        # Coq multiline comments
-        elsif token = s.scan(/\(\*/)
-          tokens << "(*"
-          s.skip_until(/\*\)/)
-          tokens << "*)"
+        # Multiline comments
+        elsif token = s.scan(START_MULTI_LINE_COMMENT)
+          tokens << token
+          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
+          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
+          tokens << close_token

        # Skip single or double quoted strings
        elsif s.scan(/"/)
--- a/test/test_tokenizer.rb
+++ b/test/test_tokenizer.rb
@@ -29,9 +29,9 @@ class TestTokenizer < Test::Unit::TestCase
  end

  def test_skip_comments
-    assert_equal %w(foo #), tokenize("foo # Comment")
-    assert_equal %w(foo # bar), tokenize("foo # Comment\nbar")
-    assert_equal %w(foo //), tokenize("foo // Comment")
+    assert_equal %w(foo #), tokenize("foo\n# Comment")
+    assert_equal %w(foo # bar), tokenize("foo\n# Comment\nbar")
+    assert_equal %w(foo //), tokenize("foo\n// Comment")
    assert_equal %w(foo /* */), tokenize("foo /* Comment */")
    assert_equal %w(foo /* */), tokenize("foo /* \nComment\n */")
    assert_equal %w(foo <!-- -->), tokenize("foo <!-- Comment -->")