Extract single and multi line comment parser

2026-08-01 14:32:24 +00:00 · 2012-07-20 15:06:21 -05:00
parent d063089430
commit 175d4244c2
2 changed files with 33 additions and 39 deletions
@@ -24,6 +24,27 @@ module Linguist
      extract_tokens(data)
    end

+    SINGLE_LINE_COMMENTS = [ # line-comment markers; START_SINGLE_LINE_COMMENT requires a space after each
+      '//', # C
+      '#',  # Ruby
+      '%',  # TeX, Matlab
+    ]
+
+    MULTI_LINE_COMMENTS = [ # [opener, closer] pairs; the closer is looked up from the opener with assoc
+      ['/*', '*/'],    # C
+      ['<!--', '-->'], # XML
+      ['{-', '-}'],    # Haskell
+      ['(*', '*)']     # Coq
+    ]
+
+    START_SINGLE_LINE_COMMENT =  Regexp.compile(SINGLE_LINE_COMMENTS.map { |c|
+      "^\s*#{Regexp.escape(c)} "
+    }.join("|"))
+
+    START_MULTI_LINE_COMMENT =  Regexp.compile(MULTI_LINE_COMMENTS.map { |c|
+      Regexp.escape(c[0]) # c[0] is the opening delimiter of the pair; escaped so it matches literally
+    }.join("|"))
+
    # Internal: Extract generic tokens from data.
    #
    # data - String to scan.
@@ -39,44 +60,17 @@ module Linguist

      tokens = []
      until s.eos?
-        # Ruby single line comment
-        if token = s.scan(/# /)
-          tokens << "#"
+        # Single line comment
+        if token = s.scan(START_SINGLE_LINE_COMMENT)
+          tokens << token.strip
          s.skip_until(/\n|\Z/)

-        # C style single line comment
-        elsif token = s.scan(/\/\/ /)
-          tokens << "//"
-          s.skip_until(/\n|\Z/)
-
-        # Leading Tex or Matlab comments
-        elsif token = s.scan(/\n%/)
-          tokens << "%"
-          s.skip_until(/\n|\Z/)
-
-        # C multiline comments
-        elsif token = s.scan(/\/\*/)
-          tokens << "/*"
-          s.skip_until(/\*\//)
-          tokens << "*/"
-
-        # Haskell multiline comments
-        elsif token = s.scan(/\{-/)
-          tokens << "{-"
-          s.skip_until(/-\}/)
-          tokens << "-}"
-
-        # XML multiline comments
-        elsif token = s.scan(/<!--/)
-          tokens << "<!--"
-          s.skip_until(/-->/)
-          tokens << "-->"
-
-        # Coq multiline comments
-        elsif token = s.scan(/\(\*/)
-          tokens << "(*"
-          s.skip_until(/\*\)/)
-          tokens << "*)"
+        # Multiline comments
+        elsif token = s.scan(START_MULTI_LINE_COMMENT)
+          tokens << token
+          close_token = MULTI_LINE_COMMENTS.assoc(token)[1]
+          s.skip_until(Regexp.compile(Regexp.escape(close_token)))
+          tokens << close_token

        # Skip single or double quoted strings
        elsif s.scan(/"/)